Summary
Loading
packages
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.0
## ✓ tidyr 1.1.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggrepel)
library(funFEM)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: fda
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Attaching package: 'fda'
## The following object is masked from 'package:graphics':
##
## matplot
## Loading required package: elasticnet
## Loading required package: lars
## Loaded lars 1.2
theme_set(theme_classic(18) +
theme(legend.position = "bottom"))
data
# total_cases = readr::read_csv("total_cases.csv")
#
# cases_long = total_cases %>%
# dplyr::select(-World) %>%
# tidyr::pivot_longer(cols = -date,
# names_to = "regions",
# values_to = "cum_cases")
rawdata = readr::read_csv("owid-covid-data.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## iso_code = col_character(),
## continent = col_character(),
## location = col_character(),
## date = col_date(format = ""),
## new_tests = col_logical(),
## total_tests = col_logical(),
## total_tests_per_thousand = col_logical(),
## new_tests_per_thousand = col_logical(),
## new_tests_smoothed = col_logical(),
## new_tests_smoothed_per_thousand = col_logical(),
## tests_per_case = col_logical(),
## positive_rate = col_logical(),
## tests_units = col_logical()
## )
## See spec(...) for full column specifications.
## Warning: 120294 parsing failures.
## row col expected actual file
## 1074 new_tests 1/0/T/F/TRUE/FALSE 75.0 'owid-covid-data.csv'
## 1074 total_tests 1/0/T/F/TRUE/FALSE 75.0 'owid-covid-data.csv'
## 1074 total_tests_per_thousand 1/0/T/F/TRUE/FALSE 0.008 'owid-covid-data.csv'
## 1074 new_tests_per_thousand 1/0/T/F/TRUE/FALSE 0.008 'owid-covid-data.csv'
## 1074 tests_units 1/0/T/F/TRUE/FALSE tests performed 'owid-covid-data.csv'
## .... ........................ .................. ............... .....................
## See problems(...) for more details.
subdata = rawdata %>%
dplyr::select(iso_code, location, date, new_cases, new_cases_per_million, new_cases_smoothed_per_million, total_cases) %>%
dplyr::filter(complete.cases(new_cases),
complete.cases(new_cases_smoothed_per_million),
location != "World")
cases30_data = subdata %>%
group_by(location) %>%
dplyr::arrange(date) %>%
dplyr::summarise(firstdate_cases30 = date[(new_cases >= 30)] %>% first,
cases30 = new_cases[(new_cases >= 30)] %>% first) %>%
dplyr::filter(complete.cases(firstdate_cases30))
## `summarise()` ungrouping output (override with `.groups` argument)
Visualisation
mergedata = subdata %>%
left_join(cases30_data, by = "location") %>%
dplyr::mutate(
days_since_cases30 = difftime(time1 = date, time2 = firstdate_cases30, units = "days"),
days_since_cases30_int = days_since_cases30 %>% as.integer) %>%
dplyr::filter(as.integer(days_since_cases30) >= 0,
total_cases >= 50000)
mergedata %>%
ggplot(aes(x = days_since_cases30_int,
y = new_cases_per_million,
group = location)) +
# geom_point(alpha = 0.5) +
geom_line() +
scale_y_continuous(trans = "log1p", labels = scales::comma,
breaks = c(1, 10, 1e2, 1e3, 1e4, 1e5, 1e6))
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous y-axis

mergedata %>%
ggplot(aes(x = days_since_cases30_int,
y = new_cases_smoothed_per_million,
group = location)) +
# geom_point(alpha = 0.5) +
geom_line() +
scale_y_continuous(trans = "log1p", labels = scales::comma)

Functional clustering example
# library(funHDDC)
# data("trigo")
# dim(trigo)
#
# basis <- create.bspline.basis(c(0,1), nbasis=25)
# var1 <- smooth.basis(argvals=seq(0,1,length.out = 100),y=t(trigo[,1:100]),fdParobj=basis)$fd
#
# res.uni <- funHDDC(var1,K=2,model="AkBkQkDk",init="kmeans",threshold=0.2)
#
# plot(var1)
#
# # matplot(trigo, type = "l")
head(day.5)
CanadianWeather_Temp <- CanadianWeather$dailyAv[,,"Temperature.C"]
head(CanadianWeather_Temp)
dim(CanadianWeather_Temp)
matplot(CanadianWeather_Temp, type = "l")
basis <- create.bspline.basis(c(0, 365), nbasis=21, norder=4) # norder=4 : cubic spline
fdobj <- smooth.basis(day.5, CanadianWeather_Temp,
basis, fdnames=list("Day", "Station", "Deg C"))$fd
res <- funFEM(fdobj, K=4)
# plot(fdobj, col=res$cls, lwd=2, lty=1)
matplot(fdobj$coefs, type = "l", col = res$cls)
Applying functional clustering to COVID
mergedata_wide = mergedata %>%
pivot_wider(id_cols = days_since_cases30_int,
names_from = location,
values_from = new_cases_smoothed_per_million) %>%
dplyr::arrange(days_since_cases30_int)
# mergedata_wide %>%
# summarise_all(.funs = list(~mean(is.na(.))))
mergedata_wide_zeroes_mat = mergedata_wide %>%
dplyr::mutate_all(.funs = coalesce, 0) %>%
as.data.frame %>% tibble::column_to_rownames("days_since_cases30_int") %>% as.matrix()
mergedata_wide_zeroes_mat %>% dim
## [1] 199 50
basis <- create.bspline.basis(c(0, nrow(mergedata_wide_zeroes_mat)), nbasis = 80, norder = 4) # norder=4 : cubic spline
fdobj <- smooth.basis(argvals = seq_len(nrow(mergedata_wide_zeroes_mat)),
y = mergedata_wide_zeroes_mat,
basis)$fd
res <- funFEM(fdobj, K = 20, model = "AB")
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
# matplot(fdobj$coefs, type = "l", col = res$cls)
plotdf = mergedata %>%
left_join(tibble(
location = colnames(mergedata_wide_zeroes_mat),
cluster = res$cls), by = "location") %>%
group_by(cluster, iso_code) %>%
dplyr::mutate(
label = ifelse(days_since_cases30_int == max(days_since_cases30_int), iso_code, NA))
plotdf %>%
ggplot(aes(x = days_since_cases30_int,
y = new_cases_smoothed_per_million,
group = iso_code)) +
geom_line() +
geom_text_repel(aes(label = label), colour = "#3079ff", fontface = 2) +
facet_wrap(~cluster, scales = "free_y", labeller = label_both) +
labs(x = "Days since first reaching 30 confirmed daily cases",
y = "New cases per million (smoothed)",
title = "Countries with similar COVID-19 new cases trajectories",
subtitle = "(I don't think this clustering is great...)",
caption = "https://ourworldindata.org/coronavirus")
## Warning: Removed 3837 rows containing missing values (geom_text_repel).

Session Info
sessioninfo::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 3.6.2 (2019-12-12)
## os macOS Mojave 10.14.6
## system x86_64, darwin15.6.0
## ui X11
## language (EN)
## collate en_AU.UTF-8
## ctype en_AU.UTF-8
## tz Australia/Melbourne
## date 2020-08-24
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## package * version date lib source
## assertthat 0.2.1 2019-03-21 [1] CRAN (R 3.6.0)
## backports 1.1.8 2020-06-17 [1] CRAN (R 3.6.2)
## blob 1.2.1 2020-01-20 [1] CRAN (R 3.6.0)
## broom 0.7.0 2020-07-09 [1] CRAN (R 3.6.2)
## cellranger 1.1.0 2016-07-27 [1] CRAN (R 3.6.0)
## cli 2.0.2 2020-02-28 [1] CRAN (R 3.6.0)
## colorspace 1.4-1 2019-03-18 [1] CRAN (R 3.6.0)
## crayon 1.3.4 2017-09-16 [1] CRAN (R 3.6.0)
## DBI 1.1.0 2019-12-15 [1] CRAN (R 3.6.0)
## dbplyr 1.4.4 2020-05-27 [1] CRAN (R 3.6.2)
## digest 0.6.25 2020-02-23 [1] CRAN (R 3.6.0)
## dplyr * 1.0.0 2020-05-29 [1] CRAN (R 3.6.2)
## elasticnet * 1.3 2020-05-15 [1] CRAN (R 3.6.2)
## ellipsis 0.3.1 2020-05-15 [1] CRAN (R 3.6.2)
## evaluate 0.14 2019-05-28 [1] CRAN (R 3.6.0)
## fansi 0.4.1 2020-01-08 [1] CRAN (R 3.6.0)
## farver 2.0.3 2020-01-16 [1] CRAN (R 3.6.0)
## fda * 5.1.4 2020-04-20 [1] CRAN (R 3.6.2)
## forcats * 0.5.0 2020-03-01 [1] CRAN (R 3.6.2)
## fs 1.4.2 2020-06-30 [1] CRAN (R 3.6.2)
## funFEM * 1.1 2015-03-13 [1] CRAN (R 3.6.0)
## generics 0.0.2 2018-11-29 [1] CRAN (R 3.6.0)
## ggplot2 * 3.3.2 2020-06-19 [1] CRAN (R 3.6.2)
## ggrepel * 0.8.2 2020-03-08 [1] CRAN (R 3.6.2)
## glue 1.4.1 2020-05-13 [1] CRAN (R 3.6.2)
## gtable 0.3.0 2019-03-25 [1] CRAN (R 3.6.0)
## haven 2.3.1 2020-06-01 [1] CRAN (R 3.6.2)
## hms 0.5.3 2020-01-08 [1] CRAN (R 3.6.0)
## htmltools 0.5.0 2020-06-16 [1] CRAN (R 3.6.2)
## httr 1.4.1 2019-08-05 [1] CRAN (R 3.6.0)
## jsonlite 1.7.0 2020-06-25 [1] CRAN (R 3.6.2)
## knitr 1.29 2020-06-23 [1] CRAN (R 3.6.2)
## labeling 0.3 2014-08-23 [1] CRAN (R 3.6.0)
## lars * 1.2 2013-04-24 [1] CRAN (R 3.6.0)
## lattice 0.20-41 2020-04-02 [1] CRAN (R 3.6.2)
## lifecycle 0.2.0 2020-03-06 [1] CRAN (R 3.6.0)
## lubridate * 1.7.9 2020-06-08 [1] CRAN (R 3.6.2)
## magrittr 1.5 2014-11-22 [1] CRAN (R 3.6.0)
## MASS * 7.3-51.6 2020-04-26 [1] CRAN (R 3.6.2)
## Matrix * 1.2-18 2019-11-27 [1] CRAN (R 3.6.2)
## modelr 0.1.8 2020-05-19 [1] CRAN (R 3.6.2)
## munsell 0.5.0 2018-06-12 [1] CRAN (R 3.6.0)
## pillar 1.4.6 2020-07-10 [1] CRAN (R 3.6.2)
## pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 3.6.0)
## purrr * 0.3.4 2020-04-17 [1] CRAN (R 3.6.2)
## R6 2.4.1 2019-11-12 [1] CRAN (R 3.6.0)
## Rcpp 1.0.5 2020-07-06 [1] CRAN (R 3.6.2)
## readr * 1.3.1 2018-12-21 [1] CRAN (R 3.6.0)
## readxl 1.3.1 2019-03-13 [1] CRAN (R 3.6.0)
## reprex 0.3.0 2019-05-16 [1] CRAN (R 3.6.0)
## rlang 0.4.7 2020-07-09 [1] CRAN (R 3.6.2)
## rmarkdown 2.3.1 2020-06-23 [1] Github (rstudio/rmarkdown@b53a85a)
## rstudioapi 0.11 2020-02-07 [1] CRAN (R 3.6.0)
## rvest 0.3.5 2019-11-08 [1] CRAN (R 3.6.0)
## scales 1.1.1 2020-05-11 [1] CRAN (R 3.6.2)
## sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 3.6.0)
## stringi 1.4.6 2020-02-17 [1] CRAN (R 3.6.1)
## stringr * 1.4.0 2019-02-10 [1] CRAN (R 3.6.0)
## tibble * 3.0.3 2020-07-10 [1] CRAN (R 3.6.2)
## tidyr * 1.1.0 2020-05-20 [1] CRAN (R 3.6.2)
## tidyselect 1.1.0 2020-05-11 [1] CRAN (R 3.6.2)
## tidyverse * 1.3.0 2019-11-21 [1] CRAN (R 3.6.0)
## vctrs 0.3.1 2020-06-05 [1] CRAN (R 3.6.2)
## withr 2.2.0 2020-04-20 [1] CRAN (R 3.6.2)
## xfun 0.15 2020-06-21 [1] CRAN (R 3.6.2)
## xml2 1.3.2 2020-04-23 [1] CRAN (R 3.6.2)
## yaml 2.2.1 2020-02-01 [1] CRAN (R 3.6.1)
##
## [1] /Library/Frameworks/R.framework/Versions/3.6/Resources/library